import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline  # IPython magic: only valid inside a notebook, so it is
# kept as a comment here — as a bare statement it is a syntax error in a .py file.
# Load the sales dataset; head()/describe() are notebook-style displays.
data = pd.read_excel("sales_data.xlsx")
data.head()
data.describe()
The describe() method returns descriptive statistics for each numeric column:
count - the number of non-empty values in the column.
mean - the average (mean) value of the column.
std - the standard deviation of the column.
min - the minimum value in the column.
25% - the 25th percentile of the column.
50% - the 50th percentile (median) of the column.
75% - the 75th percentile of the column.
max - the maximum value in the column.
# Column dtypes and non-null counts, then per-column missing-value totals
# (isna() is the modern alias of isnull(); identical behaviour).
data.info()
data.isna().sum()
Number of unique values in each column
# Number of distinct values per column. nunique(dropna=False) matches
# len(unique()) exactly (NaN counts as a value) without materialising the array.
for col in data.columns:
    print(col, '-', data[col].nunique(dropna=False))

# Pairwise correlations. numeric_only=True keeps pandas >= 2.0 from raising
# on non-numeric columns (older pandas silently dropped them).
data.corr(numeric_only=True)
# Histogram for every numeric column (pandas plotting).
data.hist(figsize=(16, 8))
plt.show()

# Bar chart of row counts per TOTAL_ORDERS value.
order_counts = data.groupby('TOTAL_ORDERS').count()
order_counts.plot(kind='bar', figsize=(20, 8))
plt.show()

# Relationship between order count and revenue.
plt.figure(figsize=(20, 6))
sns.lineplot(x='TOTAL_ORDERS', y='REVENUE', data=data)
plt.show()

# Summary statistics for the two key columns.
key_cols = ["REVENUE", "TOTAL_ORDERS"]
data[key_cols].describe()
# Build the RFM table from the raw columns:
#   RECENCY   <- DAYSSINCELASTORDER
#   FREQUENCY <- AVGDAYSBETWEENORDERS
#   MONETARY  <- REVENUE
rfm_data = data[['CustomerID', 'DAYSSINCELASTORDER', 'AVGDAYSBETWEENORDERS', 'REVENUE']]
rfm_data = rfm_data.rename(columns={'DAYSSINCELASTORDER': 'RECENCY',
                                    'AVGDAYSBETWEENORDERS': 'FREQUENCY',
                                    'REVENUE': 'MONETARY'})
rfm_data
# Descriptive statistics (Recency)
rfm_data.RECENCY.describe()

# Recency distribution plot. sns.distplot() was deprecated and removed in
# seaborn 0.14; histplot(..., kde=True, stat='density') is the supported
# equivalent. seaborn is already imported at the top of the file.
ax = sns.histplot(rfm_data['RECENCY'], kde=True, stat='density')
plt.show()
# Descriptive statistics (Frequency)
rfm_data.FREQUENCY.describe()

# Frequency distribution plot. sns.distplot() was deprecated and removed in
# seaborn 0.14; histplot(..., kde=True, stat='density') is the supported
# equivalent. seaborn is already imported at the top of the file.
ax = sns.histplot(rfm_data['FREQUENCY'], kde=True, stat='density')
plt.show()
# Descriptive statistics (Monetary)
rfm_data.MONETARY.describe()

# Monetary distribution plot. sns.distplot() was deprecated and removed in
# seaborn 0.14; histplot(..., kde=True, stat='density') is the supported
# equivalent. seaborn is already imported at the top of the file.
ax = sns.histplot(rfm_data['MONETARY'], kde=True, stat='density')
plt.show()
from sklearn.preprocessing import StandardScaler

# Z-score the three RFM features; CustomerID is an identifier, not a feature.
scaler = StandardScaler()
features = rfm_data.drop('CustomerID', axis=1)
scaled_data = pd.DataFrame(scaler.fit_transform(features),
                           columns=['RECENCY', 'FREQUENCY', 'MONETARY'])
scaled_data

# Re-attach the customer identifier to the scaled features.
final_data = pd.concat([data['CustomerID'], scaled_data], axis=1)
final_data
# 25/50/75% cut points for each RFM feature, used below to assign quartile
# scores. CustomerID is excluded: quantiles of an identifier are meaningless
# and only the three feature keys are ever read from the resulting dict,
# whose shape is {column: {0.25: v, 0.5: v, 0.75: v}}.
quantiles = final_data[['RECENCY', 'FREQUENCY', 'MONETARY']].quantile(q=[0.25, 0.5, 0.75])
quantiles = quantiles.to_dict()
quantiles
# Functions to create R, F and M segments
def RScoring(x, p, d):
    """Return a 1-4 quartile score for value ``x`` (smaller value -> lower score).

    x: the feature value to score.
    p: the column name to look up in ``d``.
    d: nested quantile dict {column: {0.25: v, 0.50: v, 0.75: v}} as produced
       by ``DataFrame.quantile(...).to_dict()``.
    """
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]:
        return 3
    else:
        return 4
def FnMScoring(x, p, d):
    """Return a 4-1 quartile score for value ``x`` (smaller value -> higher score).

    Mirror image of RScoring. x: the feature value; p: the column name;
    d: nested quantile dict {column: {0.25: v, 0.50: v, 0.75: v}}.
    """
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]:
        return 2
    else:
        return 1
# Add per-feature quartile score columns R, F and M.
final_data['R'] = final_data['RECENCY'].apply(RScoring, args=('RECENCY', quantiles))
final_data['F'] = final_data['FREQUENCY'].apply(FnMScoring, args=('FREQUENCY', quantiles))
final_data['M'] = final_data['MONETARY'].apply(FnMScoring, args=('MONETARY', quantiles))
final_data.head()

# RFMGroup: the three digits concatenated into one string, e.g. "123".
final_data['RFMGroup'] = (final_data['R'].astype(str)
                          + final_data['F'].astype(str)
                          + final_data['M'].astype(str))
# RFMScore: the three digits summed.
final_data['RFMScore'] = final_data[['R', 'F', 'M']].sum(axis=1)
final_data.head()
# Bucket customers into three equal-sized loyalty tiers by total RFM score
# (lowest tier first in the label list).
loyalty_levels = ['need attention', 'Potential customers', 'champions']
score_cuts = pd.qcut(final_data.RFMScore, q=3, labels=loyalty_levels)
final_data['RFM_Loyalty_Level'] = score_cuts.values
final_data.head()
import plotly.express as px

# 3-D scatter of the R/F/M scores coloured by loyalty tier.
# (Removed dead code `df = px.data.iris()`: it loaded an unrelated sample
# dataset that was never used.)
fig = px.scatter_3d(final_data, x='R', y='F', z='M', color='RFM_Loyalty_Level')
fig.show()

# RFM score per customer, coloured by loyalty tier.
plt.figure(figsize=(10, 6))
sns.scatterplot(y='RFMScore', x='CustomerID', data=final_data, hue='RFM_Loyalty_Level')
from sklearn.linear_model import LogisticRegression

# multi_class='multinomial' is the default behaviour for the lbfgs solver and
# the parameter itself is deprecated in scikit-learn >= 1.5, so it is omitted.
log = LogisticRegression()
# Features (quartile scores) plus the label column. .copy() makes this an
# independent frame so adding/overwriting a column later does not raise a
# SettingWithCopyWarning against final_data.
log_data = final_data[['R', 'F', 'M', 'RFM_Loyalty_Level']].copy()
The subset of the data we need for modelling
log_data

# Label-encode the categorical loyalty level into integers.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Fix: the original wrote the encoded labels to a misspelled NEW column
# ('RFM_Lotalty_Level') and only worked because iloc[:, -1] happened to pick
# that last column. Encode in place under the correct column name instead;
# X and y select the same data either way.
log_data['RFM_Loyalty_Level'] = encoder.fit_transform(log_data['RFM_Loyalty_Level'])
X = log_data.iloc[:, 0:3]
X
y = log_data.iloc[:, -1]
y
log.fit(X, y)
# NOTE(review): this sample is drawn from the data the model was fitted on,
# so the reported accuracy is optimistic; a proper train/test split would
# give an honest estimate.
sample = log_data.sample(100)
x_sample = sample.iloc[:, 0:3]
y_sample = sample.iloc[:, -1]
y_pred = log.predict(x_sample)
y_pred

from sklearn.metrics import accuracy_score, confusion_matrix
print("Accuracy of Logistic Regression is", accuracy_score(y_sample, y_pred) * 100, "%")
print("Logistic Regression Confusion Matrix\n")
pd.DataFrame(confusion_matrix(y_sample, y_pred), columns=list(range(3)))
# Visualise the fitted classifier's decision regions. plot_decision_regions
# draws a 2-D plot over the first two features (R and F); since X has three
# features, filler_feature_values pins feature index 2 (the M score) at the
# constant value 3 for the plot. NOTE(review): 3 is one fixed M value —
# regions will differ for other M scores.
from mlxtend.plotting import plot_decision_regions
plot_decision_regions(X.values, y.values, log, legend=2,filler_feature_values = {2:3})
plt.show()